# loading packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.4.4
## Loading required package: ggplot2
library(ggplot2)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.4.4
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:ggmap':
## 
##     inset
library(ggdendro)
## Warning: package 'ggdendro' was built under R version 3.4.4
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.4.4
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(RColorBrewer)
library(wesanderson)
## Warning: package 'wesanderson' was built under R version 3.4.4

Seattle Parks and Recreational Data – Trails

# Read in the raw trails data and keep only the attributes of interest.
trails = read.csv('Trails.csv')
trails1 = select(trails, PMA_NAME, TRAIL_ID, PMAID, TRAIL_NUM,
                 SURFACE_TY, CONDITION, WIDTH, CANOPY, GRADE_TYPE,
                 GRADE_PERC, SHAPE_LENG)
# Order rows by park ('PMAID') and by each trail's id within the park
# ('TRAIL_NUM'), then give the three id columns friendlier names.
trails1 = arrange(trails1, PMAID, TRAIL_NUM)
colnames(trails1)[2:4] = c("ID", "Park_ID", "Trail_ID")

# keep an untouched copy of trails1 just in case
trails2 = trails1

# data summary
# WIDTH has a max of 103 against a 3rd quartile of 5 -- almost certainly
# a data-entry outlier; checked with the bar chart below
summary(trails1)  # note something with 'WIDTH'
##                           PMA_NAME          ID          Park_ID      
##  DISCOVERY PARK               : 245   404-11 :   3   Min.   : 234.0  
##  WARREN G. MAGNUSON PARK      : 216   416-8  :   3   1st Qu.: 310.0  
##  WESTCREST PARK               : 202   282-29 :   2   Median : 398.0  
##  WASHINGTON PARK AND ARBORETUM: 182   310-363:   2   Mean   : 694.7  
##  LINCOLN PARK                 : 159   327-10 :   2   3rd Qu.: 460.0  
##  CAMP LONG                    : 125   347-14 :   2   Max.   :4479.0  
##  (Other)                      :1495   (Other):2610                   
##     Trail_ID         SURFACE_TY       CONDITION        WIDTH       
##  Min.   :  1.00   Gravel  :1213   Eroded   :  70   Min.   :  1.00  
##  1st Qu.: 13.00   Soil    : 615   Good     :1443   1st Qu.:  3.00  
##  Median : 40.00   Concrete: 275   Overgrown:  67   Median :  4.00  
##  Mean   : 70.64   Stairs  : 190   Poor     : 128   Mean   :  4.24  
##  3rd Qu.:103.00   Asphalt : 145   Worn     : 916   3rd Qu.:  5.00  
##  Max.   :391.00   Grass   :  63                    Max.   :103.00  
##                   (Other) : 123                                    
##     CANOPY        GRADE_TYPE    GRADE_PERC       SHAPE_LENG      
##  Closed: 330   Flat    :739   Min.   : 0.000   Min.   :   2.516  
##  High  : 747   Gradual :986   1st Qu.: 3.000   1st Qu.:  88.489  
##  Low   : 323   Moderate:608   Median : 5.000   Median : 177.548  
##  Open  :1224   Steep   :291   Mean   : 4.714   Mean   : 279.527  
##                               3rd Qu.: 7.000   3rd Qu.: 344.706  
##                               Max.   :20.000   Max.   :4584.925  
## 
# check the numeric 'WIDTH' distribution
ggplot(trails1, aes(x=WIDTH)) + geom_bar()

# obs with WIDTH=103 might be outlier
# remove outlier obs
# NOTE(review): this drops every row whose WIDTH is exactly 103; assumes
# the value is unique to the outlying segment -- confirm against raw data
trails1 = filter(trails1, WIDTH != 103)

Initial Thoughts

After this simple data summary, I came up with some questions:

  • Question 1: What is the variable ‘GRADE_PERC’? It seems to be closely related to ‘GRADE_TYPE’. I could not find complete metadata from the data source, City of Seattle (Seattle.gov), so I could not get an official explanation of this variable.

  • Question 2: Are there any relationships among the trail data attributes (‘SURFACE_TYPE’, ‘CONDITION’, ‘WIDTH’, ‘CANOPY’, ‘GRADE_TYPE’ and ‘GRADE_PERC’) for different city parks in Seattle? Particularly, which ones have relatively stronger relationships with trail condition?

  • Question 3: Do any parks share common trail features? Are they close to each other? What are their features? Is this information valuable for future city planning?

Analysis Plan

  1. I am going to do an exploratory data analysis using PCA to figure out which trail attributes are more related to each other. Meanwhile, find some evidence to show GRADE_PERC and GRADE_TYPE are basically representing the same trail feature.
  • One challenge is the prcomp() only takes numeric variables. Thus, I need to first of all transform those categorical variables into ordinal ones. I can generally stick to intuition as most of those factor variables are initially ordinal as well. However, for 10 different SURFACE TYPEs, I can’t decide the classification scheme with order easily. Some surface types share a common feature such as low cost; some surface types are all more durable; some are easier to build… I can’t say which features are more important. Therefore, I need some additional steps to help me find out a proper classification scheme for SURFACE TYPE. I decided to do document clustering using vector space modeling.
  2. Find trail attribute representatives for each park and use this information to do hierarchical clustering. This may provide some insights on how parks differ with respect to their trails.

  3. Do a ‘map matrix’ for interesting trail attributes based on the PCA results from step 1.

Data preprocessing and PCA

Prepare Ordinal Variables

# Check 'Grade_type' levels and transform to ordinal numbers
# Flat--1, gradual--2, moderate--3, steep--4 
# (replacing levels() keeps the columns as factors whose labels are the
# digit strings; they are converted to numeric later, just before PCA)
levels(trails1$GRADE_TYPE)
## [1] "Flat"     "Gradual"  "Moderate" "Steep"
levels(trails1$GRADE_TYPE) = c(1,2,3,4)
# Check 'Canopy' levels and transform to ordinal numbers
# Closed--4, high--3, low--2, open--1
levels(trails1$CANOPY)
## [1] "Closed" "High"   "Low"    "Open"
levels(trails1$CANOPY) = c(4,3,2,1)
# Check 'Condition' levels and transform to ordinal numbers
# Worn and erode -- 1, good--4, overgrown--3, poor--2
levels(trails1$CONDITION)
## [1] "Eroded"    "Good"      "Overgrown" "Poor"      "Worn"
# 'Eroded' and 'Worn' are deliberately merged into the same level "1"
levels(trails1$CONDITION) = c(1, 4, 3, 2, 1)

# Check 'Surface type' but found no clue for giving different categorical
# levels an order and transform them into a Likert Scale

# Try text clustering
levels(trails1$SURFACE_TY)
##  [1] "Asphalt"     "Bark"        "Boardwalk"   "Bridge"      "Check Steps"
##  [6] "Concrete"    "Grass"       "Gravel"      "Soil"        "Stairs"

Surface Types Classification

# Vector Space Modeling
# Code modified from the code provided by instructor during the class
# I couldn't find proper document to describe grass-type trail so I would
# intuitively group it with soil as both of them sound close to nature

# Each .txt file is a reference document describing one surface type;
# scan(what = character()) reads it as a vector of whitespace-separated
# words.  NOTE(review): 'broadwalk.txt' appears to misspell 'boardwalk' --
# kept as-is to match the file names on disk.
d1 = scan("asphalt.txt", what = character())
d2 = scan("bark.txt", what = character())
d3 = scan("bridge.txt", what = character())
d4 = scan("broadwalk.txt", what = character())
d5 = scan("check_step.txt", what = character())
d6 = scan("concrete.txt", what = character())
d7 = scan("gravel.txt", what = character())
d8 = scan("soil.txt", what = character())
d9 = scan("step.txt", what = character())

# stop words to be removed from the text documents
# I remove the terms appeared in surface type names as well
# because 1. lots of ducuments seem to compare specific surface types
# 2. I am more interested in using other surface type properties to
# classify these surface types rather than using their names

# Stop words stripped from every document.  Terms that name a surface type
# are removed as well, because many documents compare surface types by
# name and the goal is to cluster on their other properties.
rm.words = c("the", "The", "of", "it", "A", "a", "and", "to", "or",
             "in", "on", "by", "as", "if", "for", "is", "are", "It", 
             "from", "that", "be", "do", "also", "an", "An", "with",
             "asphalt", "bark", "bridge", "broadwalk", "step",
             "concrete", "gravel", "soil")

# TRUE for elements of x that do NOT appear in tab
"%not.in%" = function(x, tab) !(x %in% tab)

## Clean one document: strip punctuation and digits, drop empty strings
## and stop words, then lower-case every remaining term.
## Note the stop-word filter runs before lower-casing, which is why
## rm.words lists some capitalized variants explicitly.
clean.doc = function(doc) {
    words = gsub("[[:digit:]]+", "", gsub("[[:punct:]]+", "", doc))
    words = words[nzchar(words)]          # drop empty strings
    tolower(words[words %not.in% rm.words])
}

# clean each of the nine surface-type documents
d1 = clean.doc(d1)
d2 = clean.doc(d2)
d3 = clean.doc(d3)
d4 = clean.doc(d4)
d5 = clean.doc(d5)
d6 = clean.doc(d6)
d7 = clean.doc(d7)
d8 = clean.doc(d8)
d9 = clean.doc(d9)

## Unique terms from all documents
ds = paste("d", 1:9, sep="")   # document labels "d1".."d9"
terms = unique(c(d1,d2,d3,d4,d5,d6,d7,d8,d9))
nterms = length(terms)  # number of terms
nds = 9               # number of documents

## For one document, count how many times each reference term occurs.
##
## d        : character vector of (cleaned) words in the document
## term.vec : reference vocabulary; defaults to the global 'terms' so
##            existing get.count(d) calls behave exactly as before
##
## Returns an unnamed numeric vector of counts, one per term, in the
## same order as term.vec.  Replaces the original O(length(term.vec) *
## length(d)) hand-rolled double loop with a vectorized comparison.
get.count = function(d, term.vec = terms) {
    vapply(term.vec, function(trm) as.numeric(sum(d == trm)),
           numeric(1), USE.NAMES = FALSE)
}

## tf values (the number of times a term appears in a document)
tf1 = get.count(d1)
tf2 = get.count(d2)
tf3 = get.count(d3)
tf4 = get.count(d4)
tf5 = get.count(d5)
tf6 = get.count(d6)
tf7 = get.count(d7)
tf8 = get.count(d8)
tf9 = get.count(d9)

# term-frequency matrix: one row per unique term, one column per document
tfs = cbind(tf1, tf2, tf3, tf4, tf5, tf6, tf7, tf8, tf9)
dimnames(tfs) = list(terms, ds)

## the document frequency values 
# (the number of documents that contain a term)
dfs = apply(tfs, 1, function(x) sum(x>0))

## compute the idf(inverse document frequency) values
# base-10 log; rarer terms get larger weights
idf = log(nds/dfs, base=10)

## compute the tf-idf weights, i.e. the term-document matrix
# each column (document) is its tf vector weighted elementwise by idf
td.mat = apply(tfs, 2, function(x) x*idf)

## Cosine similarity between two numeric vectors, returned as a plain
## scalar.  The original x %*% y form produced a 1x1 matrix; sum(x * y)
## yields the same value without the matrix wrapper.  (Still NaN if either
## vector is all zeros, matching the original behavior.)
cos.simi = function(x,y) sum(x * y) / sqrt(sum(x^2) * sum(y^2))

## compute the similarities between all pairs of docs
S.mat = matrix(0, nrow=nds, ncol=nds) # will hold the similarities

# the matrix is symmetric, so both triangles are computed; fine at 9x9
for (i in 1:nds) {
    for (j in 1:nds) {
        x = td.mat[,i]
        y = td.mat[,j]
        S.mat[i,j] = cos.simi(x,y)
    }
}

# append the names back
# (same order as d1..d9 above; "stairs" corresponds to step.txt)
ds1 = c("asphalt", "bark", "bridge", "broadwalk", "check_step",
        "concrete", "gravel", "soil", "stairs")

dimnames(S.mat) = list(ds1, ds1)
# similarity is in [0, 1] here, so 1 - similarity is a valid dissimilarity
D = as.dist(1 - S.mat)

## agglomerative hierarchical clustering
hc = hclust(D, method="average")
plot(hc)

# compare against complete linkage before choosing
hc1 = hclust(D, method = "complete")
plot(hc1)

I chose to use complete linkage, and the resulting clusters are:

  • Cluster1: Soil, grass (natural earth, soft, cheaper, level of human impact: 1)

  • Cluster2: bark, broadwalk, check step, stairs (neutral, relatively soft, smaller constructions, level of human impact: 2)

  • Cluster3: bridge, asphalt, concrete, gravel (hard materials, larger constructions, least natural earth to look at, level of human impact: 3)

# transform the 'Surface_ty' to ordinal
# according to the above clustering results:
#   1 (natural earth)       : Grass, Soil
#   2 (small constructions) : Bark, Boardwalk, Check Steps, Stairs
#   3 (hard materials)      : Asphalt, Bridge, Concrete, Gravel
# the mapping vector below follows the alphabetical level order printed here
levels(trails1$SURFACE_TY)
##  [1] "Asphalt"     "Bark"        "Boardwalk"   "Bridge"      "Check Steps"
##  [6] "Concrete"    "Grass"       "Gravel"      "Soil"        "Stairs"
levels(trails1$SURFACE_TY) = c(3, 2, 2, 3, 2, 3, 1, 3, 1, 2)

PCA

# Convert the recoded factor columns (5:10) to numeric.  apply() first
# coerces the data frame to a character matrix, so as.numeric() works
# here only because every factor label is a digit string.
trails1[,5:10] = apply(trails1[,5:10], 2, as.numeric)

# Run PCA
# on the seven trail attributes (columns 5:11), centered and scaled
pc.trail = prcomp(trails1[,5:11], scale = TRUE,center=TRUE)
summary(pc.trail)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6    PC7
## Standard deviation     1.5144 1.1955 1.0453 0.9796 0.76487 0.69924 0.3888
## Proportion of Variance 0.3276 0.2042 0.1561 0.1371 0.08357 0.06985 0.0216
## Cumulative Proportion  0.3276 0.5318 0.6879 0.8250 0.90856 0.97840 1.0000
# Proportion of Variance Explained (PVE) code from text book
pve =100* pc.trail$sdev ^2/ sum(pc.trail$sdev ^2)
par(mfrow =c(1,2))
plot(pve, type ="o", ylab="PVE", xlab="Principal Component",
col ="red")
plot(cumsum (pve), type="o", ylab =" Cumulative PVE", xlab="
Principal Component", col = "darkblue")

# It turned out the first three principal components are adequate
# to explain about 70% of the variation in the data.
# First two principal components can help explain over 50% of the variation.

# Loading vectors for the first three principal components
pc.trail$rotation[,1:3]
##                   PC1         PC2         PC3
## SURFACE_TY -0.2664147  0.54985242  0.15028293
## CONDITION  -0.3003709 -0.08559298 -0.69801083
## WIDTH      -0.2477839  0.64937712  0.05105992
## CANOPY      0.4085213  0.07555024  0.49821832
## GRADE_TYPE  0.5558279  0.23902742 -0.33018694
## GRADE_PERC  0.5377682  0.29129634 -0.34855880
## SHAPE_LENG -0.1112421  0.34777868 -0.09407790
# biplot: PC1/PC2 scores as points, loading vectors drawn from the origin
pcr = data.frame(pc.trail$rotation[,1:2])

# fix: alpha is a fixed geom_point aesthetic; in the original it was passed
# to data.frame() and became a constant data column named 'alpha' instead
# of setting point transparency
ggplot() +
  geom_point(aes(PC1, PC2), size = 0.2, alpha = 0.7,
             data = data.frame(pc.trail$x)) + 
  annotate(geom="segment", x=0, y=0, xend=pcr$PC1*10, yend=pcr$PC2*10, alpha = 0.9,
           color = "palegreen4", size = 0.8) +
  annotate(geom="text", x=pcr$PC1*11, y=pcr$PC2*11, label=rownames(pcr), 
           alpha=0.9, size =3.5, color = "palegreen4")

  • GRADE_TYPE and GRADE_PERC –> PC1 –> topographic feature of the trail
  • GRADE_TYPE and GRADE_PERC have the largest loading values, 0.56 and 0.54. They are both positive and very close in magnitude. So GRADE_PERC should be highly correlated with GRADE_TYPE. Observations (trail segments) with higher PC1 scores tend to have higher GRADE_TYPE or GRADE_PERC values (in other words, steeper).

  • SURFACE TYPE, WIDTH and SHAPE_LENGTH –> PC2 –> initial trail setup
  • SURFACE TYPE and WIDTH have the largest loadings, 0.55 and 0.65. They are both positive and a little bit off in magnitude. But they are relatively more correlated with each other than with other variables. Trail segments with higher PC2 scores tend to have ‘higher’ surface type value (harder materials, more expensive and greater human impact) and larger width. SHAPE_LENGTH’s loading value is around 0.35 which is not very close to the ones for surface type and width. So I deem it not very important.

  • CONDITION and CANOPY – PC3 – trail usage and maintenance (still think important)
  • CONDITION and CANOPY have the largest loadings in magnitude, 0.70 (-0.698) and 0.50 (0.498). However, they have opposite signs. Thus, these variables are negatively correlated with each other. Trail segments with higher PC3 scores tend to have ‘higher’ canopy values (more tree canopies) and ‘lower’ condition values (worse trail condition).

Representative Trail Features and Park Clusters

Representative Trail Features

## Try to find parks with similar trail features
## Form a new dataset for plot and clustering

# Each block below picks, per park, the attribute value carried by the
# greatest total trail length (a length-weighted "mode").
# NOTE(review): filter(tot_length == max(tot_length)) keeps more than one
# row per park on exact ties, which would misalign the bind_cols() later --
# confirm each result has exactly one row per park.

# decide trail 'surface type' for each park by maximum total shape length
trails_surf = trails1 %>% group_by(PMA_NAME, SURFACE_TY) %>% 
  summarise(tot_length = sum(SHAPE_LENG)) %>% 
  filter(tot_length == max(tot_length))

# decide trail 'condition' for each park by maximum total shape length
trails_cond = trails1 %>% group_by(PMA_NAME, CONDITION) %>% 
  summarise(tot_length = sum(SHAPE_LENG))%>% 
  filter(tot_length == max(tot_length))

# decide trail 'width' for each park by maximum total shape length
trails_width = trails1 %>% group_by(PMA_NAME, WIDTH)%>% 
  summarise(tot_length = sum(SHAPE_LENG))%>% 
  filter(tot_length == max(tot_length))

# decide trail 'canopy' type for each park by maximum total shape length
trails_canopy = trails1 %>% group_by(PMA_NAME, CANOPY) %>% 
  summarise(tot_length = sum(SHAPE_LENG))%>% 
  filter(tot_length == max(tot_length))

# decide trail 'grade_type' for each park by maximum total shape length
trails_grade = trails1 %>% group_by(PMA_NAME, GRADE_TYPE) %>% 
  summarise(tot_length = sum(SHAPE_LENG)) %>% 
  filter(tot_length == max(tot_length))

# total length of trails in each park
trails_length = trails1 %>% group_by(PMA_NAME) %>% 
  summarise(TOT_LENGTH = sum(SHAPE_LENG))

# grade_perc
trails_perc = trails1 %>% group_by(PMA_NAME, GRADE_PERC) %>% 
  summarise(tot_length = sum(SHAPE_LENG)) %>% 
  filter(tot_length == max(tot_length))

Park Clusters

# Form trails3 for clustering the parks
# NOTE(review): bind_cols() relies on every summary above having exactly
# one row per park, in the same (PMA_NAME) order -- verify before trusting
trails3 = bind_cols(trails_surf[,1:2], trails_cond[,2], trails_width[,2],
                    trails_canopy[,2], trails_grade[,2], 
                    trails_perc[,2], trails_length[,2])

trails3 = as.data.frame(trails3)
row.names(trails3) = trails3$PMA_NAME
trails3 = trails3[,-1]
# matrix to store similarity values
park_trail_simi = matrix(0, nrow=nrow(trails3), ncol=nrow(trails3)) 

# pairwise cosine similarity between parks over their feature vectors
for (i in 1:nrow(trails3)) {
    for (j in 1:nrow(trails3)) {
        x = as.numeric(trails3[i,])
        y = as.numeric(trails3[j,])
        park_trail_simi[i,j] = cos.simi(x,y)
    }
}
dimnames(park_trail_simi) = list(rownames(trails3), rownames(trails3))
# distances
D_park = as.dist(1 - park_trail_simi)

## hierarchical clustering and dendrogram
hc2 = hclust(D_park, method="complete")
ggdendrogram(hc2, rotate = TRUE, theme_dendro = FALSE) + geom_hline(aes(yintercept=2.7e-05), linetype = "dashed")

# PEPPI'S PLAYGROUND is isolated from the rest 
# difficult to see other clusters

# remove PEPPI'S PLAYGROUND 
# NOTE(review): row index 46 is hard-coded; matching the rowname would be
# safer -- confirm row 46 is still PEPPI'S PLAYGROUND if the data changes
trails4 = trails3[-46,]

# Recompute pairwise cosine similarities between the remaining parks
# (outlier park removed) and convert them to distances for clustering.
n.parks = nrow(trails4)
park_trail_simi1 = matrix(0, nrow = n.parks, ncol = n.parks,
                          dimnames = list(rownames(trails4), rownames(trails4)))

for (a in seq_len(n.parks)) {
    row.a = as.numeric(trails4[a, ])
    for (b in seq_len(n.parks)) {
        park_trail_simi1[a, b] = cos.simi(row.a, as.numeric(trails4[b, ]))
    }
}
D_park1 = as.dist(1 - park_trail_simi1)

## hierarchical clustering (complete linkage) and dendrogram; the dashed
## line marks the cut height used to read off the clusters below
hc3 = hclust(D_park1, method = "complete")
ggdendrogram(hc3, rotate = TRUE, theme_dendro = FALSE) +
  geom_hline(yintercept = 2.69e-05, linetype = 'dashed')

Visualizing Features on Maps

# Visualize those parks on map
parks = rownames(trails4)
parks = paste(parks, 'Seattle,WA', sep=",")  # full query strings for the geocoder

# geocode the park locations (network call; may be rate-limited)
park_geoc = geocode(parks, override_limit = TRUE)

# if NA appears regeocode until all NAs are filled with values
park_geoc[which(is.na(park_geoc$lon)),] = 
      geocode(parks[which(is.na(park_geoc$lon))])

# store as text file in case need to regeocode (takes long time)
# sink() redirects console output to 'park_geoc.txt'; write.table(file="")
# then writes into that redirected stream
sink("park_geoc.txt")
write.table(park_geoc, quote=FALSE, row.names=FALSE,
col.names=TRUE, file="")
sink()
# park name + lon/lat + trail features, one row per park
parkntrail = bind_cols(park = parks, park_geoc, trails4)

# Seattle_KingCT = get_googlemap('Seattle', zoom = 10, scale = 2,
# type = 'hybrid', override_limit = TRUE)

# base_map = ggmap(Seattle_KingCT)
# save(base_map, file = "basemap.RData")
# The base map was fetched once (commented-out code above) and cached.
load("basemap.RData") # in case some error appears when running get_googlemap

## Grade_Type

# factor so ggplot uses a discrete color scale
parkntrail$GRADE_TYPE = as.factor(parkntrail$GRADE_TYPE)

# map of dominant grade type per park (1 = flat ... 4 = steep)
plot1 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
  scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) +
  geom_point(aes(x=lon, y=lat, color = GRADE_TYPE), size = 2.5, alpha = 0.9,
             data = parkntrail) + coord_fixed(1.6) +
   scale_color_brewer(name = "GRADE", palette="RdYlGn", direction = -1)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Surface_Type 

parkntrail$SURFACE_TY = as.factor(parkntrail$SURFACE_TY)

# map of dominant surface type per park (1 = natural ... 3 = hard material)
plot2 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
  scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) + 
  geom_point(aes(x=lon, y=lat, color = SURFACE_TY), size = 2.5, alpha = 0.9,
             data = parkntrail) + 
  scale_color_manual(name = "SURFACE", values=wes_palette(n=3, name="Zissou1",
  type = "continuous"))+ coord_fixed(1.6) 
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## WIDTH

# map of dominant trail width per park (width kept numeric: point size)
plot3 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
  scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) + 
  geom_point(aes(x=lon, y=lat, size= WIDTH), color = 'steelblue4', alpha = 0.6,
             data = parkntrail) + coord_fixed(1.6)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## CANOPY

parkntrail$CANOPY = as.factor(parkntrail$CANOPY)

# map of dominant canopy coverage per park (1 = open ... 4 = closed)
plot4 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
  scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) + 
  geom_point(aes(x=lon, y=lat, color = CANOPY), size = 2.5, alpha = 0.9,
             data = parkntrail) + coord_fixed(1.6) +
   scale_color_brewer(palette="PiYG", direction = 1)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## CONDITION
# Worn and erode -- 1, poor--2, overgrown--3, good--4

parkntrail$CONDITION = as.factor(parkntrail$CONDITION)

# map of dominant trail condition per park (1 = worn/eroded ... 4 = good)
plot5 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
  scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) +
  geom_point(aes(x=lon, y=lat, color = CONDITION), size = 2.5, alpha = 0.9,
             data = parkntrail) + coord_fixed(1.6) +
  scale_color_manual(values=wes_palette(n=5, name="Darjeeling1")[c(1, 4, 2)])
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
# Append the ggmap of CONDITION to each of the other trail attribute ggmaps
# CONDITION (left) next to GRADE_TYPE (right)
ggarrange(plot5, plot1, ncol = 2, nrow = 1, widths = c(1.08,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).

## Warning: Removed 1 rows containing missing values (geom_rect).

  • Most parks with steep type (GRADE: 4) trails have the worst trail condition (CONDITION: 1, worn/eroded). Most parks with flat type (GRADE: 1) trails have the best trail condition (CONDITION: 4, good). It makes sense that steeper slopes are more likely to have erosion.
# CONDITION (left) next to SURFACE_TY (right)
ggarrange(plot5, plot2, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).

## Warning: Removed 1 rows containing missing values (geom_rect).

  • It looks like trails in each surface type can have all different types of trail condition. Trails with greater construction using harder materials are more correlated with better trail condition.
# CONDITION (left) next to WIDTH (right)
ggarrange(plot5, plot3, ncol = 2, nrow = 1, widths = c(1.1,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).

## Warning: Removed 1 rows containing missing values (geom_rect).

  • It looks like trails with different width can have all different types of trail condition.
# CONDITION (left) next to CANOPY (right)
ggarrange(plot5, plot4, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).

## Warning: Removed 1 rows containing missing values (geom_rect).

  • Open canopy environments (CANOPY: 1, open) are more associated with good trail condition (CONDITION: 4, good); the greater the canopy coverage (CANOPY: 2, 3, 4), the more likely the trail is to be in bad condition (CONDITION: 1).

Visualizing Park Clusters on Maps

# Assign each park to one of the four clusters read off the dendrogram (hc3).
#
# Park clusterA: WEST DUWAMISH GREENBELT, SEOLA BEACH GREENBELT,
#                THORNTON CREEK PARK #2
#
# Park clusterB: PEPPIS PLAYGROUND, HITT'S HILL PARK, 
#                LAKE PEOPLE PARK (XACUA'BS)
#
# Park clusterC: LICORICE FERN NA ON TC, CEDAR PARK, DEARBORN PARK,
#                MARTHA WASHINGTON PARK, BHY KRACKE PARK, NORTH EAST
#                QUEEN ANNE GREENBELT, LONGFELLOW CREEK GS: SOUTH,
#                ST. MARKS GREENBELT, BOREN PARK, DUWAMISH HEAD GREENBELT
#
# Park clusterD: the rest of parks

parkntrail$park = rownames(trails4)

cluA = which(parkntrail$park %in% 
  c("WEST DUWAMISH GREENBELT", "SEOLA BEACH GREENBELT", "THORNTON CREEK PARK #2"))

cluB = which(parkntrail$park %in%
  c("PEPPIS PLAYGROUND", "HITT'S HILL PARK", "LAKE PEOPLE PARK (XACUA'BS)"))

cluC = which(parkntrail$park %in% 
  c("LICORICE FERN NA ON TC", "CEDAR PARK", "DEARBORN PARK", "MARTHA WASHINGTON PARK",
    "BHY KRACKE PARK", "NORTH EAST QUEEN ANNE GREENBELT", "LONGFELLOW CREEK GS: SOUTH",
    "ST. MARKS GREENBELT", "BOREN PARK", "DUWAMISH HEAD GREENBELT"))

# setdiff is robust even if one of the name vectors matches no parks;
# the original negative-index form (1:n)[-c(...)] silently returns an
# empty vector when all index vectors are empty
cluD = setdiff(seq_len(nrow(parkntrail)), c(cluA, cluB, cluC))

# initialize as character: the labels are strings, so the original
# numeric(...) initialization only worked through silent coercion
parkntrail$CLUSTER = character(nrow(parkntrail))
parkntrail$CLUSTER[cluA] = "A"
parkntrail$CLUSTER[cluB] = "B"
parkntrail$CLUSTER[cluC] = "C"
parkntrail$CLUSTER[cluD] = "D"

## ClUSTERs on a map

parkntrail$CLUSTER = as.factor(parkntrail$CLUSTER)

# map of the four park clusters (A-D), one color per cluster
plot6 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
  scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) + 
  geom_point(aes(x=lon, y=lat, color = CLUSTER), size = 2.5, alpha = 0.9, 
      data = parkntrail) + coord_fixed(1.6)+
  scale_color_manual(values=c(wes_palette(n=5, name="BottleRocket2")[c(1,2)],
  wes_palette(n=4, name="GrandBudapest1")[2], 
  wes_palette(n=5, name="BottleRocket2")[4]))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
plot6
## Warning: Removed 1 rows containing missing values (geom_rect).

  • Cluster-A parks are generally located in the southwest

  • Cluster-B parks are around central southeast

  • Cluster-C parks are found from north to south but many of them are not close to any shorelines

  • Cluster-D parks are everywhere and most of them are along the shorelines.

# Append the ggmap of CLUSTER to each of the other trail attribute ggmaps
# CLUSTER (left) next to GRADE_TYPE (right)
ggarrange(plot6, plot1, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1,1))
## Warning: Removed 1 rows containing missing values (geom_rect).

## Warning: Removed 1 rows containing missing values (geom_rect).

# CLUSTER (left) next to SURFACE_TY (right)
ggarrange(plot6, plot2, ncol = 2, nrow = 1, widths = c(1,1), heights = c(1,1))
## Warning: Removed 1 rows containing missing values (geom_rect).

## Warning: Removed 1 rows containing missing values (geom_rect).

# CLUSTER (left) next to WIDTH (right)
ggarrange(plot6, plot3, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).

## Warning: Removed 1 rows containing missing values (geom_rect).

# CLUSTER (left) next to CANOPY (right)
ggarrange(plot6, plot4, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).

## Warning: Removed 1 rows containing missing values (geom_rect).

# CLUSTER (left) next to CONDITION (right)
ggarrange(plot6, plot5, ncol = 2, nrow = 1, widths = c(1,1), heights = c(1,1))
## Warning: Removed 1 rows containing missing values (geom_rect).

## Warning: Removed 1 rows containing missing values (geom_rect).

  • Cluster-A parks: gradual or moderate slopes, natural earth surface, narrow trail width, have canopy and some with closed canopy, very bad trail condition.

  • Cluster-B parks: non-flat topography, hard and durable constructed surface, moderate width, open canopy, very good trail condition.

  • Cluster-C parks: mostly non-flat slopes, some have natural earth surface some have constructed surface, very few high or closed canopy, mostly good condition.

  • Cluster-D parks: a mixture of every type, generally flatter topography, constructed surface, larger width, mostly open canopy, some have good condition and some have the worst condition.

Note that cluster C and D are relatively hard to discriminate with each other as these two clusters got combined at relatively shorter distance. They have a lot in common. On the contrary, cluster A and cluster B are quite different except that neither of them have flat slopes. This makes sense as well since they have greater height of fusion, recall the dendrogram.

Some valuable information:

  • Cluster-A parks: WEST DUWAMISH GREENBELT, SEOLA BEACH GREENBELT and THORNTON CREEK PARK #2 need more attention for better maintenance or upgrade.

  • Cluster-B parks: PEPPIS PLAYGROUND, HITT’S HILL PARK and LAKE PEOPLE PARK(XACUA’BS) have good conditions and these can be explained by the corresponding features: flat ground(less likely to have erosion), durable surface types with proper width, no canopy (less likely to have overgrown, lower moisture).

  • May need to further group the parks in cluster D to separate those with good-condition trails from those with worse-condition trails.

# take geographic location into consideration

# append lon/lat to the park feature table
# NOTE(review): lon (~ -122) and lat (~ 47) enter the cosine similarity
# unscaled, so their magnitudes dwarf the 1-4 coded trail features; this
# likely explains the "little improvement" observed below -- consider
# standardizing the columns before recomputing
trails4 = data.frame(trails4, park_geoc)
# Recalculate the similarity matrix and distances
park_trail_simi2 = matrix(0, nrow=nrow(trails4), ncol=nrow(trails4)) 

for (i in 1:nrow(trails4)) {
    for (j in 1:nrow(trails4)) {
        x = as.numeric(trails4[i,])
        y = as.numeric(trails4[j,])
        park_trail_simi2[i,j] = cos.simi(x,y)
    }
}
dimnames(park_trail_simi2) = list(rownames(trails4), rownames(trails4))
D_park2 = as.dist(1 - park_trail_simi2)

## hierarchical clustering and dendrogram
hc4 = hclust(D_park2, method="complete")
ggdendrogram(hc4, rotate = TRUE, theme_dendro = FALSE) + 
  geom_hline(yintercept = 0.005, linetype = 'dashed')

# Assign each park to one of the four clusters read off the new
# dendrogram (hc4, with lon/lat included).
#
# Park clusterE: WEST DUWAMISH GREENBELT
#
# Park clusterF: PEPPIS PLAYGROUND, HITT'S HILL PARK, LICORICE FERN NA ON TC,
#                LAKE PEOPLE PARK(XACUA'BS), THORNTON CREEK PARK #2
#
# Park clusterG: DUWAMISH HEAD GREENBELT, BOREN PARK, CHEASTY GREENSPACE: MT VIEW,
#                LONGFELLOW CREEK GS: NORTH, CHEASTY GREENSPACE, PRITCHARD ISLAND 
#                BEACH, LONGFELLOW CREEK GREENSPACE, SOLSTICE PARK, DEARBORN PARK,
#                ST. MARKS GREENBELT, MARTHA WASHINGTON PARK,
#                NORTH EAST QUEEN ANNE GREENBELT, BHY KRACKE PARK,
#                LONGFELLOW CREEK GS: SOUTH, SW QUEEN ANNE GREENBELT,
#                LAKEVIEW PARK, KIWANIS MEMORIAL PRESERVE PARK, CEDAR PARK,
#                SEOLA BEACH GREENBELT
#
# Park clusterH: the rest of parks

cluE = which(parkntrail$park %in%
  c("WEST DUWAMISH GREENBELT"))

cluF = which(parkntrail$park %in% 
  c("PEPPIS PLAYGROUND", "HITT'S HILL PARK", "LICORICE FERN NA ON TC",
    "LAKE PEOPLE PARK (XACUA'BS)", "THORNTON CREEK PARK #2"))

cluG = which(parkntrail$park %in% 
  c("DUWAMISH HEAD GREENBELT", "BOREN PARK", "CHEASTY GREENSPACE: MT VIEW",
    "LONGFELLOW CREEK GS: NORTH", "CHEASTY GREENSPACE", "PRITCHARD ISLAND BEACH",
    "LONGFELLOW CREEK GREENSPACE", "SOLSTICE PARK", "DEARBORN PARK", 
    "ST. MARKS GREENBELT", "MARTHA WASHINGTON PARK", 
    "NORTH EAST QUEEN ANNE GREENBELT", "BHY KRACKE PARK", 
    "LONGFELLOW CREEK GS: SOUTH", "SW QUEEN ANNE GREENBELT",
    "LAKEVIEW PARK", "KIWANIS MEMORIAL PRESERVE PARK", "CEDAR PARK", 
    "SEOLA BEACH GREENBELT"))

# setdiff is robust even if one of the name vectors matches no parks;
# the original negative-index form (1:n)[-c(...)] silently returns an
# empty vector when all index vectors are empty
cluH = setdiff(seq_len(nrow(parkntrail)), c(cluE, cluF, cluG))

# initialize as character: the labels are strings, so the original
# numeric(...) initialization only worked through silent coercion
parkntrail$CLUSTER2 = character(nrow(parkntrail))
parkntrail$CLUSTER2[cluE] = "E"
parkntrail$CLUSTER2[cluF] = "F"
parkntrail$CLUSTER2[cluG] = "G"
parkntrail$CLUSTER2[cluH] = "H"

## ClUSTERs on a map again

parkntrail$CLUSTER2 = as.factor(parkntrail$CLUSTER2)

# map of the geography-aware clusters (E-H), one color per cluster
plot7 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
  scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) + 
  geom_point(aes(x=lon, y=lat, color = CLUSTER2), size = 2.5, alpha = 0.9,
             data = parkntrail) + coord_fixed(1.6)+
  scale_color_manual(values=c(wes_palette(n=5, name="BottleRocket2")[c(1,2)],
          wes_palette(n=4, name="GrandBudapest1")[2], 
          wes_palette(n=5, name="BottleRocket2")[4]))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
plot7
## Warning: Removed 1 rows containing missing values (geom_rect).

# geography-aware clusters (left) vs feature-only clusters (right)
ggarrange(plot7, plot6, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).

## Warning: Removed 1 rows containing missing values (geom_rect).

  • Little improvement results from adding longitude and latitude to the similarity/distance calculation; more parks are clustered in the central and southern parts. Perhaps the geographic location difference is already explained by one or more of the trail features, such as canopy and grade type.

‘Map Matrix’: Condition, Canopy and Grade

# big 4-by-4 map matrix, need 16 datasets

# Canopy codes: Open--1, low--2, high--3, Closed--4
# Grade codes:  Flat--1, gradual--2, moderate--3, steep--4
#
# Build one data frame per (canopy, grade) combination instead of 16
# near-identical filter() statements.  The resulting objects keep the
# original names (e.g. 'open_flat', ..., 'closed_steep') so the
# map-building code below is unaffected.
canopy_codes = c(open = 1, low = 2, high = 3, closed = 4)
grade_codes  = c(flat = 1, gradual = 2, moderate = 3, steep = 4)

canopy_grade_subsets = list()
for (cn in names(canopy_codes)) {
  for (gn in names(grade_codes)) {
    # plain local copies so dplyr's data masking cannot shadow them
    canopy_code = canopy_codes[[cn]]
    grade_code  = grade_codes[[gn]]
    canopy_grade_subsets[[paste(cn, gn, sep = "_")]] =
      filter(parkntrail, CANOPY == canopy_code, GRADE_TYPE == grade_code)
  }
}

# Export the 16 subsets under their conventional names (open_flat, ...).
list2env(canopy_grade_subsets, envir = environment())

## 16 ggmaps using the above 16 filtered dataframes

# Common zoomed base map shared by all 16 panels.
base_map1 = base_map + scale_x_continuous(limits = c(-122.5, -122.2), 
          expand = c(0, 0)) + scale_y_continuous(limits = c(47.475, 47.75), 
                              expand = c(0, 0))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

# Helper: plot one canopy/grade subset on the zoomed base map, colored
# by trail CONDITION.  'palette_idx' picks the Darjeeling1 colors that
# match the CONDITION levels present in 'dat' (chosen manually per
# subset, as in the original 16 statements); NULL keeps ggplot's
# default discrete colors.
condition_map = function(dat, palette_idx = NULL) {
  p = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
        size = 3, alpha = 0.9, data = dat) + coord_fixed(1.6)
  if (!is.null(palette_idx)) {
    p = p + scale_color_manual(
      values = wes_palette(n=4, name="Darjeeling1")[palette_idx])
  }
  p
}

open_flat_map       = condition_map(open_flat, c(1,2))
open_gradual_map    = condition_map(open_gradual, c(1,4,2))
open_moderate_map   = condition_map(open_moderate, 2)
open_steep_map      = condition_map(open_steep, 2)
low_flat_map        = condition_map(low_flat, 2)
low_gradual_map     = condition_map(low_gradual, c(1,2))
low_moderate_map    = condition_map(low_moderate, 1)
low_steep_map       = condition_map(low_steep, 1)
high_flat_map       = condition_map(high_flat, 2)
high_gradual_map    = condition_map(high_gradual, c(1,2))
high_moderate_map   = condition_map(high_moderate, c(1,2))
high_steep_map      = condition_map(high_steep, c(1,2))
closed_flat_map     = condition_map(closed_flat)  # no manual palette originally
closed_gradual_map  = condition_map(closed_gradual, 1)
closed_moderate_map = condition_map(closed_moderate, c(1,2))
closed_steep_map    = condition_map(closed_steep, 1)


## Align the maps as 4 by 4 map matrix

# One row of the matrix: four maps side by side, legends suppressed.
map_row = function(m1, m2, m3, m4) {
  ggarrange(m1, m2, m3, m4, ncol = 4, nrow = 1, legend = "none")
}

gg1 = map_row(closed_flat_map, closed_gradual_map, closed_moderate_map,
              closed_steep_map)
gg2 = map_row(high_flat_map, high_gradual_map, high_moderate_map,
              high_steep_map)
gg3 = map_row(low_flat_map, low_gradual_map, low_moderate_map,
              low_steep_map)
gg4 = map_row(open_flat_map, open_gradual_map, open_moderate_map,
              open_steep_map)

# Stack the four rows: canopy varies by row, grade type by column.
fig = ggarrange(gg1, gg2, gg3, gg4, ncol = 1, nrow = 4)

annotate_figure(fig, top = text_grob("Park Trails Condition by Canopy and Grade Type",
    color = "Black", size = 16),
left = text_grob("Canopy: open -- low -- high -- closed (from bottom to top)",
    color = "Black", size = 14, rot = 90), 
    bottom = text_grob("Grade Type: flat -- gradual -- moderate -- steep 
   (from left to right)", color = "Black", size = 14))

  • Red (condition 1: worn/eroded), orange (condition 2: poor), cyan (condition 4: good).

  • Most of the trails in Seattle are in open canopy environments with flat, gradual, or moderate grade types.

  • Steeper trails are usually under greater tree canopy (probably in forests on hillier terrain).

  • Open canopy and flatter topography are correlated with better trail condition.

  • Higher canopy coverage and steeper slopes are correlated with worse trail condition.

  • Very few parks in Seattle area have trails under greater canopy coverage and on flatter land. Very few parks have trails under low canopy coverage, let alone low canopy coverage and more challenging grade types.